import streamlit as st
import datetime as dt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import clickhouse_driver
from clickhouse_driver.client import Client
from pandas_profiling import ProfileReport
import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)
clickhouse = Client('85.193.83.20', database = 'hn_launches',
user = 'admin', password = '0987654321')
2022-06-22 18:17:01.223 INFO visions.backends: Pandas backend loaded 1.4.0 2022-06-22 18:17:01.278 INFO visions.backends: Numpy backend loaded 1.22.1 2022-06-22 18:17:01.280 INFO visions.backends: Pyspark backend NOT loaded 2022-06-22 18:17:01.281 INFO visions.backends: Python backend loaded
#Quering data from Clickhouse
launches_metrics = clickhouse.query_dataframe("""
select l.item_id as item_id, max(date(l.time)) as date, max(name) as name,
max(yc_batch) as yc_batch,
toInt8(max(replaceRegexpAll(l.yc_batch, 'S|W| Nonprofit', ''))) as yc_batch_y,
max(short_description) as short_description,
max(industry) as industry, max(is_oss) as is_oss,
max(length(text)) as text_len,
max(score) as score,
uniq(comment_id) as comments_qty,
uniqIf(comment_id, sentiment == 'pos') as comments_pos_qty,
uniqIf(comment_id, sentiment == 'neg') as comments_neg_qty,
uniqIf(comment_id, sentiment == 'neu') as comments_neu_qty,
max(employees) as employees, max(estimated_revenue) as revenue, max(total_funding) as total_funding, max(github_stars) as github_stars
from hn_launches.launches l
left join hn_launches.comments c on c.item_id = l.item_id
left join hn_launches.company_growth g on g.item_id = l.item_id
group by item_id;
""")
launches_metrics['date'] = pd.to_datetime(launches_metrics['date'])
launches_metrics.info()
launches_metrics.head()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 291 entries, 0 to 290 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 item_id 291 non-null int64 1 date 291 non-null datetime64[ns] 2 name 291 non-null object 3 yc_batch 291 non-null object 4 yc_batch_y 291 non-null int64 5 short_description 291 non-null object 6 industry 291 non-null object 7 is_oss 291 non-null int64 8 text_len 291 non-null int64 9 score 291 non-null int64 10 comments_qty 291 non-null int64 11 comments_pos_qty 291 non-null int64 12 comments_neg_qty 291 non-null int64 13 comments_neu_qty 291 non-null int64 14 employees 291 non-null int64 15 revenue 291 non-null int64 16 total_funding 291 non-null float64 17 github_stars 291 non-null int64 dtypes: datetime64[ns](1), float64(1), int64(12), object(4) memory usage: 41.0+ KB
| item_id | date | name | yc_batch | yc_batch_y | short_description | industry | is_oss | text_len | score | comments_qty | comments_pos_qty | comments_neg_qty | comments_neu_qty | employees | revenue | total_funding | github_stars | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 26468204 | 2021-03-15 | Mezli | W21 | 21 | Robotic restaurants that serve healthy fast food | Food | 0 | 5637 | 201 | 127 | 69 | 9 | 49 | 15 | 2430000 | 0.0 | 0 |
| 1 | 24158509 | 2020-08-14 | Tella | S20 | 20 | Collaborative video editing in the browser | SaaS | 0 | 4601 | 221 | 81 | 54 | 2 | 25 | 0 | 0 | 0.0 | 0 |
| 2 | 28877003 | 2021-10-15 | Inflow | S21 | 21 | Self-help app for people with adhd | Consumer | 0 | 2930 | 183 | 249 | 116 | 24 | 109 | 0 | 0 | 0.0 | 0 |
| 3 | 25991485 | 2021-02-01 | Opstrace | S19 | 19 | Open-source datadog | DevTools | 1 | 4994 | 316 | 114 | 78 | 4 | 32 | 0 | 0 | 0.0 | 1188 |
| 4 | 23846186 | 2020-07-15 | Legacy | S19 | 19 | We help men test and freeze their sperm | Consumer | 0 | 3672 | 193 | 92 | 48 | 5 | 39 | 88 | 12320000 | 20.2 | 0 |
launches_metrics.describe()
| item_id | yc_batch_y | is_oss | text_len | score | comments_qty | comments_pos_qty | comments_neg_qty | comments_neu_qty | employees | revenue | total_funding | github_stars | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.910000e+02 | 291.000000 | 291.000000 | 291.000000 | 291.000000 | 291.000000 | 291.000000 | 291.000000 | 291.000000 | 291.000000 | 2.910000e+02 | 2.910000e+02 | 291.000000 |
| mean | 2.743650e+07 | 20.621993 | 0.099656 | 3894.986254 | 140.762887 | 75.615120 | 43.615120 | 4.632302 | 27.367698 | 29.292096 | 6.378193e+06 | 5.154689e+05 | 128.189003 |
| std | 2.150811e+06 | 1.300423 | 0.300057 | 1367.983439 | 87.503589 | 53.233671 | 27.680996 | 5.711952 | 23.012115 | 135.469709 | 4.294584e+07 | 8.793155e+06 | 1002.542634 |
| min | 2.377021e+07 | 8.000000 | 0.000000 | 75.000000 | 27.000000 | 6.000000 | 4.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000e+00 | 0.000000e+00 | 0.000000 |
| 25% | 2.607926e+07 | 20.000000 | 0.000000 | 3109.000000 | 79.000000 | 37.000000 | 24.000000 | 1.000000 | 11.000000 | 0.000000 | 0.000000e+00 | 0.000000e+00 | 0.000000 |
| 50% | 2.742374e+07 | 21.000000 | 0.000000 | 3869.000000 | 117.000000 | 61.000000 | 38.000000 | 3.000000 | 20.000000 | 0.000000 | 0.000000e+00 | 0.000000e+00 | 0.000000 |
| 75% | 2.914154e+07 | 21.000000 | 0.000000 | 4693.500000 | 183.500000 | 92.500000 | 54.500000 | 6.000000 | 35.000000 | 15.500000 | 1.475250e+06 | 0.000000e+00 | 0.000000 |
| max | 3.172484e+07 | 22.000000 | 1.000000 | 8363.000000 | 617.000000 | 251.000000 | 159.000000 | 32.000000 | 109.000000 | 1696.000000 | 6.529600e+08 | 1.500000e+08 | 12753.000000 |
profile = ProfileReport(launches_metrics, title = 'HN Launches Profiling Report')
profile.to_notebook_iframe()
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
metric = 'score'
px.box(launches_metrics, y = metric, color = 'industry',
title = f'Boxplot of {metric}')
metric = 'score'
#industries = launches_metrics['industry'].unique().tolist()
#me = [launches_metrics[launches_metrics['industry'] == i][metric].values for i in industries]
fig = ff.create_distplot([launches_metrics[metric].values], [metric], show_hist = False)
#fig = ff.create_distplot(me, industries, show_hist = False)
fig.update_layout(width = 700, height = 600, title = f'Distplot of {metric}')
fig.show()
px.bar(launches_metrics[(launches_metrics['yc_batch_y'].isin([19, 20, 21])) & (~launches_metrics['yc_batch'].str.contains('Nonprofit'))] \
.groupby(['yc_batch', 'yc_batch_y', 'industry'])['item_id'] \
.nunique().reset_index().sort_values('yc_batch_y', ascending = False).rename(columns = {'item_id': 'launches'}),
x = 'launches', y = 'yc_batch', color = 'industry', title = 'YC Batches Launches')
px.line(launches_metrics[launches_metrics['yc_batch_y'].isin([19, 20, 21])].groupby(pd.Grouper(key = 'date', freq = 'M', axis = 0))['item_id'] \
.nunique().reset_index().rename(columns = {'item_id': 'launches'}),
x = 'date', y = 'launches', title = 'YC Launches')
px.bar(launches_metrics.groupby(['industry'])['score'].mean().reset_index() \
.sort_values('score', ascending = False),
x = 'score', y = 'industry', color = 'industry', title = 'Top Scored Industries')
px.bar(launches_metrics.groupby(['industry'])['comments_qty'].mean().reset_index() \
.sort_values('comments_qty', ascending = False),
x = 'comments_qty', y = 'industry', color = 'industry', title = 'Top Commented Industries')
px.bar(launches_metrics.groupby(['industry'])['comments_neg_qty'].mean().reset_index() \
.sort_values('comments_neg_qty', ascending = False),
x = 'comments_neg_qty', y = 'industry', color = 'industry', title = '(anti)Top Negative Commented Industries')
metric = 'score'
px.bar(launches_metrics.groupby(['name'])[metric].max().reset_index().sort_values(metric, ascending = False)[:100],
y = metric, x = 'name', title = f'Top Launches by {metric}')
px.bar(launches_metrics.groupby(['name'])[['comments_qty', 'comments_neu_qty', 'comments_pos_qty', 'comments_neg_qty']].max().reset_index() \
.sort_values('comments_qty', ascending = False)[:100],
y = ['comments_neu_qty', 'comments_pos_qty', 'comments_neg_qty'], x = 'name',
color_discrete_map = {'comments_neu_qty': '#636EFA', 'comments_pos_qty': '#00CC96', 'comments_neg_qty': '#EF553B'},
title = f'Top Commented Lauhches')
metric_x = 'score'; metric_y = 'comments_pos_qty'; metric_size = 'text_len'
px.scatter(launches_metrics, x = metric_x, y = metric_y, color = 'industry', size = metric_size,
title = f'{metric_x} vs {metric_y}, size: {metric_size}')
corr = launches_metrics[['text_len', 'score',
'comments_qty', 'comments_pos_qty', 'comments_neg_qty',
'employees', 'revenue', 'total_funding', 'github_stars']].corr().round(2)
go.Figure(go.Heatmap(z = corr.values, x = corr.columns, y = corr.columns,
text = corr.values, texttemplate = '%{text}',
colorscale = px.colors.diverging.RdBu,
zmin = -1, zmax = 1)).show()